NIH Ward Lab - Neurons Day8 - Preprocessing QC statistics ¶

July 2025 - Nancy Y¶

In [1]:
import io
import os
import sys
import pandas as pd
import contextlib
from IPython.display import display, Javascript

NOVA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
NOVA_DATA_HOME = '/home/projects/hornsteinlab/Collaboration/NOVA'
os.environ['NOVA_HOME'] = NOVA_HOME
sys.path.insert(1, os.getenv("NOVA_HOME"))
print(f"NOVA_HOME: {os.getenv('NOVA_HOME')}")


root_directory_raw = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'raw', 'NIH', 'indi-image-pilot-20241128')
root_directory_proc = os.path.join(NOVA_DATA_HOME, 'input', 'images', 'processed', 'ManuscriptFinalData_80pct', 'NIH')

LOGS_PATH = os.path.join(NOVA_HOME, "outputs", "preprocessing", "ManuscriptFinalData_80pct", "NIH", "logs")
PLOT_PATH = os.path.join(NOVA_HOME, 'outputs', 'preprocessing', "ManuscriptFinalData_80pct", 'NIH', 'QC_figures')


from tools.preprocessing_tools.qc_reports.qc_utils import log_files_qc, run_validate_folder_structure, display_diff, sample_and_calc_variance, \
                                                show_site_survival_dapi_brenner, show_site_survival_dapi_cellpose, \
                                                show_site_survival_dapi_tiling, show_site_survival_target_brenner, \
                                                calc_total_sums, plot_filtering_heatmap, show_total_sum_tables, \
                                                plot_cell_count, plot_catplot, plot_hm_of_mean_cell_count_per_tile, \
                                                run_calc_hist_new, show_total_valid_tiles_per_marker_and_batch
                                                
from tools.preprocessing_tools.qc_reports.qc_config import NIH_d8_panels, NIH_d8_markers, NIH_d8_marker_info, NIH_d8_cell_lines, NIH_d8_cell_lines_to_cond,\
                                    NIH_d8_cell_lines_for_disp, NIH_d8_reps, NIH_d8_line_colors, NIH_d8_lines_order, NIH_d8_custom_palette,\
                                    NIH_d8_expected_dapi_raw
%load_ext autoreload
%autoreload 2
NOVA_HOME: /home/projects/hornsteinlab/Collaboration/NOVA
In [2]:
# choose batches
batches = [f'batch{i}' for i in range(1,4)]
batches
Out[2]:
['batch1', 'batch2', 'batch3']
In [3]:
df = log_files_qc(LOGS_PATH, only_wt_cond=False, batches=batches, filename_split='-',site_location=0)

#df['cell_line_cond'] = df['cell_line_cond'].str.replace(" ", "_")


df_dapi = df[df.marker=='DAPI']
df_target = df[df.marker!='DAPI']
reading logs of batch3
reading logs of batch2
reading logs of batch1

Total of 3 files were read.
Before dup handeling  (124974, 21)
After duplication removal #1: (124974, 22)
After duplication removal #2: (124974, 22)

Actual Files Validation¶

Raw Files Validation¶

  1. How many site tiff files do we have in each folder?
  2. Are all existing files valid? (tif, at least 2049kB, not corrupetd)
In [4]:
raws = run_validate_folder_structure(root_directory_raw, False, 
                                     NIH_d8_panels, 
                                     NIH_d8_markers.copy(),
                                     PLOT_PATH, 
                                     NIH_d8_marker_info,
                                     NIH_d8_cell_lines_to_cond, 
                                     NIH_d8_reps, 
                                     NIH_d8_cell_lines_for_disp,
                                     NIH_d8_expected_dapi_raw,
                                     batches=batches, 
                                     fig_width=8, fig_height = 40,
                                     expected_count=25, check_antibody=False)
batch1
Folder structure is valid.
No bad files are found.
Total Sites:  44000
========
batch2
Folder structure is valid.
No bad files are found.
Total Sites:  44000
========
batch3
Folder structure is valid.
No bad files are found.
Total Sites:  44000
========
====================
In [5]:
## Missing data issue was fixed

differences = (raws[0] != raws[1]).stack()
differences = differences[differences].index.to_frame(index=False)
differences.columns = ["Marker", "Rep", "Condition"]
for condition in differences["Condition"].unique():
    print(f"Condition: {condition}")
    condition_data = differences[differences["Condition"] == condition]
    for rep in condition_data["Rep"].unique():
        markers = condition_data[condition_data["Rep"] == rep]["Marker"].tolist()
        print(f"  Rep: {rep}")
        print(f"    Markers: {', '.join(markers)}")

Processed Files Validation¶

  1. How many site npy files do we have in each folder? -> How many sites survived the pre-processing?
  2. Are all existing files valid? (at least 100kB, npy not corrupted)
In [6]:
procs = run_validate_folder_structure(root_directory_proc, True, 
                                      NIH_d8_panels, 
                                      NIH_d8_markers,
                                      PLOT_PATH,
                                      NIH_d8_marker_info,
                                      NIH_d8_cell_lines_to_cond, 
                                      NIH_d8_reps, 
                                      NIH_d8_cell_lines_for_disp, 
                                      NIH_d8_expected_dapi_raw,
                                      fig_width=8, fig_height=40,
                                      expected_count=25, 
                                      check_antibody=False, 
                                      batches=batches)
batch1
Folder structure is valid.
No bad files are found.
Total Sites:  41546
========
batch2
Folder structure is valid.
No bad files are found.
Total Sites:  41780
========
batch3
Folder structure is valid.
No bad files are found.
Total Sites:  41565
========
====================

Difference between Raw and Processed¶

In [7]:
display_diff(batches, raws, procs, PLOT_PATH, fig_width=8, fig_height=40)
batch1
========
batch2
========
batch3
========

Variance in each batch (of processed files)¶

In [8]:
for batch in batches[:1]:
    with contextlib.redirect_stdout(io.StringIO()):
        var = sample_and_calc_variance(root_directory_proc, 
                                       batch, 
                                       sample_size_per_markers=50, 
                                       cond_count=2, 
                                       rep_count=len(NIH_d8_reps), 
                                       num_markers=len(NIH_d8_markers))
    print(f'{batch} var: ',var)
batch1 var:  0.030423036970459477

Preprocessing Filtering qc¶

By order of filtering

1. % site survival after Brenner on DAPI channel¶

Percentage out of the total sites

In [9]:
dapi_filter_by_brenner = show_site_survival_dapi_brenner(df_dapi,
                                                         batches, 
                                                         NIH_d8_line_colors, 
                                                         NIH_d8_panels, 
                                                         NIH_d8_reps, 
                                                         figsize=(6,18),
                                                         vmax=25)

2. % Site survival after Cellpose¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if Cellpose found 0 cells in it.

In [10]:
dapi_filter_by_cellpose = show_site_survival_dapi_cellpose(df_dapi, 
                                                           batches, 
                                                           dapi_filter_by_brenner, 
                                                           NIH_d8_line_colors, 
                                                           NIH_d8_panels, 
                                                           NIH_d8_reps, 
                                                           figsize=(6,18))

3. % Site survival by tiling¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values.

A site will be filtered out if after tiling, no tile is containing at least one whole cell that Cellpose detected.

In [11]:
dapi_filter_by_tiling=show_site_survival_dapi_tiling(df_dapi, 
                                                     batches, 
                                                     dapi_filter_by_cellpose, 
                                                     NIH_d8_line_colors, 
                                                     NIH_d8_panels, 
                                                     NIH_d8_reps, 
                                                     figsize=(6,18))

4. % Site survival after Brenner on target channel¶

Percentage out of the sites that passed the previous filter. In parenthesis are absolute values (if different than the percentages).

In [12]:
show_site_survival_target_brenner(df_dapi, 
                                  df_target, 
                                  dapi_filter_by_tiling, 
                                  NIH_d8_markers,
                                  figsize=(6,18))

Statistics About the Processed Files¶

In [13]:
names = ['Total number of tiles', 'Total number of whole cells']
stats = ['n_valid_tiles','site_whole_cells_counts_sum','site_cell_count','site_cell_count_sum']
total_sum = calc_total_sums(df_target, df_dapi, stats, NIH_d8_markers)

Total tiles¶

In [14]:
# markers_for_dnls = markers.copy() #TODO need to change according to - if we use all markers or just the d8 ones!!!!
# markers_for_dnls.remove('TIA1')
# markers_for_dnls += ['TDP43B']

total_sum[total_sum.marker.isin(NIH_d8_markers)].n_valid_tiles.sum()
Out[14]:
1631945

Total whole nuclei in tiles¶

In [15]:
total_sum[total_sum.marker =='DAPI'].site_whole_cells_counts_sum.sum()
Out[15]:
309515.0

Total nuclei in sites¶

In [16]:
total_sum[total_sum.marker =='DAPI'].site_cell_count.sum()
Out[16]:
987203.0
In [17]:
show_total_sum_tables(total_sum)
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch1
count 1760.000000 1760.000000 1760.000000 1.760000e+03
mean 364.939773 3.649398 271.837500 8.704131e+02
std 79.588258 0.795883 73.568335 2.132589e+02
min 19.000000 0.190000 16.000000 4.500000e+01
25% 322.000000 3.220000 224.000000 7.367500e+02
50% 369.000000 3.690000 268.000000 8.710000e+02
75% 421.250000 4.212500 324.000000 1.017000e+03
max 570.000000 5.700000 476.000000 1.583000e+03
sum 642294.000000 NaN 478434.000000 1.531927e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch2
count 1758.000000 1758.000000 1758.000000 1.758000e+03
mean 286.395336 2.863953 202.039249 6.480358e+02
std 68.996169 0.689962 54.437216 1.732157e+02
min 4.000000 0.040000 3.000000 1.100000e+01
25% 249.000000 2.490000 170.000000 5.490000e+02
50% 283.500000 2.835000 197.000000 6.370000e+02
75% 322.000000 3.220000 229.000000 7.220000e+02
max 510.000000 5.100000 383.000000 1.279000e+03
sum 503483.000000 NaN 355185.000000 1.139247e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02
n_valid_tiles % valid tiles site_whole_cells_counts_sum site_cell_count
batch3
count 1758.000000 1758.000000 1758.000000 1.758000e+03
mean 276.546075 2.765461 195.264505 6.156018e+02
std 77.778503 0.777785 61.463669 1.878487e+02
min 14.000000 0.140000 7.000000 3.300000e+01
25% 227.250000 2.272500 155.000000 4.930000e+02
50% 274.000000 2.740000 193.500000 6.020000e+02
75% 321.000000 3.210000 226.000000 7.097500e+02
max 504.000000 5.040000 421.000000 1.200000e+03
sum 486168.000000 NaN 343275.000000 1.082228e+06
expected_count 450.000000 450.000000 450.000000 4.500000e+02
n valid tiles % valid tiles site_whole_cells_counts_sum site_cell_count
All batches
count 5.276000e+03 5276.000000 5.276000e+03 5.276000e+03
mean 3.093148e+02 3.093148 2.230656e+02 7.114105e+02
std 8.531182e+01 0.853118 7.244806e+01 2.230372e+02
min 4.000000e+00 0.040000 3.000000e+00 1.100000e+01
25% 2.560000e+02 2.560000 1.750000e+02 5.660000e+02
50% 3.020000e+02 3.020000 2.140000e+02 6.830000e+02
75% 3.650000e+02 3.650000 2.640000e+02 8.550000e+02
max 5.700000e+02 5.700000 4.760000e+02 1.583000e+03
sum 1.631945e+06 NaN 1.176894e+06 3.753402e+06
expected_count 4.500000e+02 450.000000 4.500000e+02 4.500000e+02

Show Total Tile Counts¶

For each batch, cell line, replicate and marker: Total number of tiles

First, we look at all cell lines togther:¶

In [18]:
show_total_valid_tiles_per_marker_and_batch(total_sum, vmax=15000)

Separating into cell lines & batches:¶

In [19]:
to_heatmap = total_sum.rename(columns={'n_valid_tiles':'index'})
plot_filtering_heatmap(to_heatmap, 
                       extra_index='marker', 
                       vmin=None, vmax=None,
                       xlabel = 'Total number of tiles', 
                       show_sum=True, figsize=(7,28), 
                       fmt=".0f")
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)

Show Total Whole Cell Counts¶

For each batch, cell line, replicate and markerTotal number of tiles

In [20]:
to_heatmap = total_sum.rename(columns={'site_whole_cells_counts_sum':'index'})
plot_filtering_heatmap(to_heatmap, 
                       extra_index='marker', 
                       vmin=None, vmax=None,
                       xlabel = 'Total number of whole cells', 
                       show_sum=True, 
                       figsize=(7,28), 
                       fmt=".0f")
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:394: UserWarning: set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.
  ax.set_yticklabels(ax.get_yticklabels(), fontsize=6)

Show Cell Count Statistics per Batch¶

In [21]:
df_no_empty_sites = df_dapi[df_dapi.n_valid_tiles !=0]

plot_cell_count(df_no_empty_sites, 
                NIH_d8_lines_order, 
                NIH_d8_custom_palette, 
                y='site_cell_count_sum', 
                title='Cell Count Average per Site (from tiles)', 
                figsize=(16,6))


plot_cell_count(df_no_empty_sites, 
                NIH_d8_lines_order, 
                NIH_d8_custom_palette, 
                y='site_whole_cells_counts_sum',
                title='Whole Cell Count Average per Site',
                figsize=(16,6))


plot_cell_count(df_no_empty_sites, 
                NIH_d8_lines_order, 
                NIH_d8_custom_palette, 
                y='site_cell_count',
                title='Cellpose Cell Count Average per Site',
                figsize=(16,6))

Show Tiles per Site Statistics¶

In [22]:
df_dapi.groupby(['cell_line_cond']).n_valid_tiles.mean()
Out[22]:
cell_line_cond
FUSHeterozygous Untreated    13.855665
FUSHomozygous Untreated      13.848535
FUSRevertant Untreated       13.767027
WT Untreated                 11.889007
WT stress                    11.968573
Name: n_valid_tiles, dtype: float64
In [23]:
df_dapi[['site_cell_count']].mean()
Out[23]:
site_cell_count    30.081144
dtype: float64
In [24]:
plot_catplot(df_dapi, 
             NIH_d8_custom_palette,
             NIH_d8_reps, 
             x='n_valid_tiles', 
             x_title='valid tiles count', 
             batch_min=1, 
             batch_max=3, 
             height=6)
/home/projects/hornsteinlab/Collaboration/NOVA/tools/preprocessing_tools/qc_reports/qc_utils.py:1061: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'batch_rep'] = df['batch'] + " " + df['rep']

Show Mean of cell count in valid tiles¶

In [25]:
plot_hm_of_mean_cell_count_per_tile(df_dapi, 
                                    split_by='rep', 
                                    rows='cell_line_cond', 
                                    columns='panel', 
                                    figsize=(18,6))

Assessing Staining Reproducibility and Outliers¶

In [26]:
# for batch in batches:
#     print(batch)
#     run_calc_hist_new(f'{batch}', dnls_opera_cell_lines_for_disp, dnls_opera_markers,
#                       root_directory_raw, root_directory_proc,
#                            hist_sample=10,sample_size_per_markers=200, ncols=8, nrows=4, dnls=True)
#     print("="*30)
In [ ]:
# save notebook as HTML 
from IPython.display import display, Javascript
display(Javascript('IPython.notebook.save_checkpoint();'))
os.system(f'jupyter nbconvert --to html {NOVA_HOME}/tools/preprocessing_tools/qc_reports/qc_report_NIH_NeuronsDay8.ipynb --output {NOVA_HOME}/manuscript/preprocessing_qc_reports/ManuscriptFinalData/qc_report_NIH_NeuronsDay8.html')
In [ ]: